import nltk
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import LabelBinarizer
# Read the train and test frames from their pickle files (train first, then
# test) and report the shape of each.
df_train_full, df_test_full = map(
    pd.read_pickle,
    ("./pan19_df_clean_train_feateng.pkl", "./pan19_df_clean_test_feateng.pkl"),
)
print(f"train size: {df_train_full.shape}, test size: {df_test_full.shape}")
def feature(df):
    """Add surface-level text features to ``df`` in place and return it.

    The frame must contain the string columns ``tweet`` and ``clean_tweet``.
    The new columns are written directly onto the frame that is passed in
    (no copy is made), and that same object is returned.

    Args:
        df: DataFrame with ``tweet`` and ``clean_tweet`` string columns.

    Returns:
        The same ``df`` object, extended with the feature columns below.

    Notes:
        * ``caps_vs_length`` is 0.0 for an empty tweet. It used to be computed
          with a per-row Python division, which raised ``ZeroDivisionError``
          whenever ``total_length`` was 0.
        * ``words_vs_unique`` and ``word_unique_percent`` are NaN for tweets
          without any words (pandas 0/0), exactly as before.
    """
    tweets = df['tweet']
    cleaned = df['clean_tweet']

    # Size features computed from the tweet text.
    df['word_count'] = tweets.apply(lambda text: len(text.split()))
    df['char_count'] = tweets.apply(lambda text: len(text.replace(" ", "")))
    # The +1 keeps the denominator non-zero for tweets without characters.
    df['word_density'] = df['word_count'] / (df['char_count'] + 1)
    df['total_length'] = tweets.apply(len)

    # Capitalisation.
    df['capitals'] = tweets.apply(lambda text: sum(1 for ch in text if ch.isupper()))
    # Column-wise division never raises; the only NaN it can produce is 0/0
    # for an empty tweet, which is mapped to 0.0.
    df['caps_vs_length'] = (df['capitals'] / df['total_length']).fillna(0.0)

    # Punctuation and symbol counts.
    df['num_exclamation_marks'] = tweets.apply(lambda text: text.count('!'))
    df['num_question_marks'] = tweets.apply(lambda text: text.count('?'))
    df['num_punctuation'] = tweets.apply(lambda text: sum(text.count(ch) for ch in '.,;:'))
    df['num_symbols'] = tweets.apply(lambda text: sum(text.count(ch) for ch in '*&$%'))

    # Vocabulary variety (whitespace-separated tokens).
    df['num_unique_words'] = tweets.apply(lambda text: len(set(text.split())))
    df['words_vs_unique'] = df['num_unique_words'] / df['word_count']
    df["word_unique_percent"] = df["num_unique_words"] * 100 / df['word_count']

    # Substring counts on the cleaned text.
    # NOTE(review): 'rt' is counted as a plain substring, so it also matches
    # inside longer words (e.g. "party") - confirm this is intended.
    df['num_retweet'] = cleaned.apply(lambda text: text.count('rt'))
    df['num_url'] = cleaned.apply(lambda text: text.count('URL_TOKEN'))
    df['num_number'] = cleaned.apply(lambda text: text.count('NUM_TOKEN'))
    return df
## CAUTION: the commented-out lines below take only a subset of the rows to make experimenting easier
#num_examples = 10_000
#df_train = feature(df_train_full.loc[list(range(5*num_examples)), :])
#df_test = feature(df_test_full.loc[list(range(num_examples)), :])
# feature() writes its columns onto the frame it receives and returns that same
# frame, so df_train / df_test refer to the *_full objects, which now carry the
# new feature columns.
df_train = feature(df_train_full)
df_test = feature(df_test_full)
# Combined train + test frame. It is built after the two feature() calls above,
# so the copies concatenated here already contain the feature columns.
df = pd.concat((df_train_full.copy(), df_test_full.copy()))
print(f"total size: {df.shape}")
df_train.info()
# Bare expression (only has a visible effect where the last expression of a
# cell is displayed, e.g. in a notebook).
df_train
import plotly.graph_objects as go
# Exploratory radar chart on the training frame: per-class means of the token
# counts (scaled by 100) and of the size features, all drawn on the "polar3"
# subplot.
_bot_rows = df_train.loc[df_train["bot"] == "bot"]
_human_rows = df_train.loc[df_train["bot"] == "human"]

_token_columns = ['num_url', 'num_number', 'num_retweet']
_token_axes = ['urls', "nums", "Number of retweets"]
_size_columns = ['total_length', 'char_count', 'word_count']
_size_axes = ['Total_Lenght', 'Charcount', "word count"]


def _polar3_trace(values, axes, colour, label):
    """Return a filled Scatterpolar trace attached to the "polar3" subplot."""
    return go.Scatterpolar(
        r=values,
        theta=axes,
        fill='toself',
        line=dict(color=colour),
        name=label,
        subplot="polar3",
    )


# Subplot name -> (x domain, y domain), in the order they are handed to Layout.
# NOTE(review): "polar3" is the only subplot referenced by a trace and its
# domain is x=[1, 1], y=[1, 1] (zero width/height) - confirm this is intended.
# The remaining subplots are declared but no trace is attached to them.
_subplot_domains = {
    "polar3": ([1, 1], [1, 1]),
    "polar2": ([0, 0.3], [0, 0.45]),
    "polar": ([0.33, 0.6525], [0, 0.45]),
    "polar4": ([0.33, 0.6525], [0.55, 1]),
    "polar5": ([0.6775, 1], [0, 0.45]),
    "polar6": ([0.6775, 1], [0.55, 1]),
}

fig = go.Figure(
    data=[
        _polar3_trace([_bot_rows[col].mean() * 100 for col in _token_columns],
                      _token_axes, 'red', "bot stats means"),
        _polar3_trace([_human_rows[col].mean() * 100 for col in _token_columns],
                      _token_axes, 'blue', "human stats means"),
        _polar3_trace([_bot_rows[col].mean() for col in _size_columns],
                      _size_axes, 'brown', "bot stats means"),
        _polar3_trace([_human_rows[col].mean() for col in _size_columns],
                      _size_axes, 'magenta', "human stats means"),
    ],
    layout=go.Layout(**{
        subplot_name: dict(domain=dict(x=x_domain, y=y_domain),
                           radialaxis=dict(visible=True))
        for subplot_name, (x_domain, y_domain) in _subplot_domains.items()
    }),
)
fig.show()
def draw_word_features(df_train):
    """Build a radar chart of the per-class means of the word-level features.

    Args:
        df_train: DataFrame with a "bot" column (rows labelled "bot" or
            "human" are used) and the columns word_count, char_count,
            word_density, total_length, capitals and caps_vs_length.

    Returns:
        A plotly Figure holding one filled Scatterpolar trace per class, both
        attached to the "polar6" subplot.
    """
    columns = ['word_count', 'char_count', 'word_density',
               'total_length', 'capitals', 'caps_vs_length']
    axis_labels = ['mean word count', "mean char count", "mean word denisty",
                   "mean tweet length", "mean capital letter count", "mean capital vs length"]

    def class_trace(label, colour, legend_name):
        # Select the rows of one class once, then take the mean of each column.
        rows = df_train.loc[df_train["bot"] == label]
        return go.Scatterpolar(
            r=[rows[col].mean() for col in columns],
            theta=list(axis_labels),
            fill='toself',
            line=dict(color=colour),
            name=legend_name,
            subplot="polar6",
        )

    # Every declared subplot spans the whole figure area.
    subplots = {
        key: dict(domain=dict(x=[0, 1], y=[0, 1]), radialaxis=dict(visible=True))
        for key in ("polar", "polar2", "polar3", "polar4", "polar5", "polar6")
    }
    return go.Figure(
        data=[
            class_trace("bot", 'red', "bot means"),
            class_trace("human", 'blue', "human means"),
        ],
        layout=go.Layout(**subplots),
    )


# Word-level radar chart for the combined train + test frame.
fig = draw_word_features(df)
fig.show()
def draw_url_rt_num(df_train):
    """Build a radar chart of the per-class means of the token-count features.

    Args:
        df_train: DataFrame with a "bot" column (rows labelled "bot" or
            "human" are used) and the columns num_url, num_number and
            num_retweet.

    Returns:
        A plotly Figure holding one filled Scatterpolar trace per class, both
        attached to the "polar3" subplot.
    """
    columns = ['num_url', 'num_number', 'num_retweet']
    axis_labels = ['mean URL', "mean NUM", "mean RT"]

    traces = []
    for label, colour, legend_name in (("bot", 'red', "bot means"),
                                       ("human", 'blue', "human means")):
        # Rows of the current class; the mean of each column is one radar spoke.
        rows = df_train.loc[df_train["bot"] == label]
        traces.append(go.Scatterpolar(
            r=[rows[col].mean() for col in columns],
            theta=list(axis_labels),
            fill='toself',
            line=dict(color=colour),
            name=legend_name,
            subplot="polar3",
        ))

    # Every declared subplot spans the whole figure area.
    subplots = {
        key: dict(domain=dict(x=[0, 1], y=[0, 1]), radialaxis=dict(visible=True))
        for key in ("polar", "polar2", "polar3")
    }
    return go.Figure(data=traces, layout=go.Layout(**subplots))


# URL / number / retweet radar chart for the combined train + test frame.
fig = draw_url_rt_num(df)
fig.show()
def draw_uniques(df_train):
    """Build a radar chart of the per-class means of the unique-word features.

    Args:
        df_train: DataFrame with a "bot" column (rows labelled "bot" or
            "human" are used) and the columns num_unique_words,
            words_vs_unique and word_unique_percent.

    Returns:
        A plotly Figure holding one filled Scatterpolar trace per class, both
        attached to the "polar3" subplot.
    """
    columns = ['num_unique_words', 'words_vs_unique', 'word_unique_percent']
    axis_labels = ['mean unique words', "mean words vs unique", "mean words unique percent"]

    def class_means(label):
        # Mean of every column over the rows of a single class.
        rows = df_train.loc[df_train["bot"] == label]
        return [rows[col].mean() for col in columns]

    bot_trace = go.Scatterpolar(
        r=class_means("bot"),
        theta=list(axis_labels),
        fill='toself',
        line=dict(color='red'),
        name="bot values",
        subplot="polar3",
    )
    human_trace = go.Scatterpolar(
        r=class_means("human"),
        theta=list(axis_labels),
        fill='toself',
        line=dict(color='blue'),
        name="human values",
        subplot="polar3",
    )

    # Every declared subplot spans the whole figure area.
    subplots = {
        key: dict(domain=dict(x=[0, 1], y=[0, 1]), radialaxis=dict(visible=True))
        for key in ("polar", "polar2", "polar3")
    }
    return go.Figure(data=[bot_trace, human_trace], layout=go.Layout(**subplots))


# Unique-word radar chart for the combined train + test frame.
fig = draw_uniques(df)
fig.show()
def draw_punctuation(df_train):
    """Build a radar chart of the per-class means of the punctuation features.

    Args:
        df_train: DataFrame with a "bot" column (rows labelled "bot" or
            "human" are used) and the columns num_exclamation_marks,
            num_question_marks, num_punctuation and num_symbols.

    Returns:
        A plotly Figure holding one filled Scatterpolar trace per class, both
        attached to the "polar4" subplot.
    """
    columns = ['num_exclamation_marks', 'num_question_marks',
               'num_punctuation', 'num_symbols']
    axis_labels = ['mean exclamation marks', "mean question marks", "mean punctuation", "mean symbols"]
    class_styles = (("bot", 'red', "bot values"), ("human", 'blue', "human values"))

    traces = []
    for label, colour, legend_name in class_styles:
        # Rows of the current class; the mean of each column is one radar spoke.
        rows = df_train.loc[df_train["bot"] == label]
        traces.append(go.Scatterpolar(
            r=[rows[col].mean() for col in columns],
            theta=list(axis_labels),
            fill='toself',
            line=dict(color=colour),
            name=legend_name,
            subplot="polar4",
        ))

    # Every declared subplot spans the whole figure area.
    subplots = {
        key: dict(domain=dict(x=[0, 1], y=[0, 1]), radialaxis=dict(visible=True))
        for key in ("polar", "polar2", "polar3", "polar4")
    }
    freq_plot = go.Figure(data=traces, layout=go.Layout(**subplots))
    return freq_plot


# Punctuation radar chart for the combined train + test frame.
fig = draw_punctuation(df)
fig.show()
# Names of the engineered feature columns, grouped by theme.
word_features = ["word_count", "char_count", "word_density", "total_length", "capitals", "caps_vs_length"]
punctuation_features = ["num_exclamation_marks", "num_question_marks", "num_punctuation", "num_symbols"]
uniques_features = ["num_unique_words", "words_vs_unique", "word_unique_percent"]
means_features = ["num_retweet", "num_url", "num_number"]
all_new_features = [*word_features, *punctuation_features, *uniques_features, *means_features]

# Print mean / standard deviation of every feature for each class, followed by
# the absolute gap between the two classes.
_bot_part = df.loc[df["bot"] == "bot"]
_human_part = df.loc[df["bot"] == "human"]
for feat in all_new_features:
    bot_mean, bot_std = _bot_part[feat].mean(), _bot_part[feat].std()
    human_mean, human_std = _human_part[feat].mean(), _human_part[feat].std()
    print(f"{feat}[bot] mean: {bot_mean} | stddev: {bot_std}")
    print(f"{feat}[human] mean: {human_mean} | stddev: {human_std}")
    print(f"mean absolute difference: {abs(bot_mean - human_mean)} | stddev absolute difference: {abs(bot_std - human_std)}")
    print()

# Seaborn theme used by the plots that follow.
sns.set(style="whitegrid")
def plot_violins_all(dataframe):
    """Show one violin plot per engineered feature, grouped by the "bot" column.

    Args:
        dataframe: DataFrame containing a "bot" column and every column named
            in the module-level ``all_new_features`` list.
    """
    for column in all_new_features:
        sns.violinplot(data=dataframe, x="bot", y=column, hue="bot")
        # Display each feature's plot before the next one is drawn.
        plt.show()


plot_violins_all(df)
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
def _feature_scaler(step_name):
    """Return a ColumnTransformer applying StandardScaler to all engineered features."""
    return ColumnTransformer([
        (step_name, StandardScaler(), all_new_features),
    ])


# Combined frame: the scaler is fitted on, and applied to, df itself.
ct_full = _feature_scaler("all_features")
df[all_new_features] = ct_full.fit_transform(df)

# Train / test frames: the scaler is fitted on df_train only and the fitted
# scaler is then applied to df_test.
ct_words = _feature_scaler("word_cnt")
df_train[all_new_features] = ct_words.fit_transform(df_train)
df_test[all_new_features] = ct_words.transform(df_test)
df_train
# Redraw the four radar charts after scaling: first for the train frame, then
# the test frame, then the combined frame.
for frame in (df_train, df_test, df):
    for _draw in (draw_word_features, draw_url_rt_num, draw_uniques, draw_punctuation):
        fig = _draw(frame)
        fig.show()

# Violin plots of the combined frame.
plot_violins_all(df)
# Write the train and test frames to their output pickle files.
for _frame_to_save, _target_path in (
    (df_train, "./pan19_df_clean_train_full_features.pkl"),
    (df_test, "./pan19_df_clean_test_full_features.pkl"),
):
    _frame_to_save.to_pickle(_target_path)